{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import kenlm\n",
    "from tqdm import tqdm\n",
    "import fastText\n",
    "\n",
    "# Locations of the pretrained artifacts used by the cells below.\n",
    "KENLM_ARPA_PATH = 'kenlmmodel/yelp.arpa'\n",
    "FASTTEXT_BIN_PATH = 'fasttextmodel/model_yelp.bin'\n",
    "\n",
    "# KenLM model (queried for perplexity) and fastText model (queried for labels).\n",
    "modellm = kenlm.Model(KENLM_ARPA_PATH)\n",
    "classifier_model = fastText.load_model(FASTTEXT_BIN_PATH)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "396.7080016573861\n"
     ]
    }
   ],
   "source": [
    "# Sanity check: perplexity of a single tokenised sentence under the KenLM model.\n",
    "sample_sentence = 'I just hate the food .'\n",
    "print(modellm.perplexity(sample_sentence))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "([['__label__0'], ['__label__1']], array([[1.00001001],\n",
      "       [0.97183311]]))\n"
     ]
    }
   ],
   "source": [
    "# Sanity check: predict() on a list returns a (labels, probabilities) pair,\n",
    "# with one entry per input text.\n",
    "texts = [\n",
    "    'worst food ever',\n",
    "    'very bad nice place',\n",
    "]\n",
    "predictions = classifier_model.predict(texts)\n",
    "print(predictions)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "500it [00:00, 87798.38it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Accuracy: 0.88 \n",
      "Average Perplexity: 176.07\n",
      "\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    }
   ],
   "source": [
    "path_to_txt_file = './ref_files/ref1_only_predictions.txt'\n",
    "label = '__label__0'\n",
    "\n",
    "# One sentence per line; splitlines() drops the line terminators.\n",
    "with open(path_to_txt_file) as f:\n",
    "    sentences = f.read().splitlines()\n",
    "\n",
    "# predict() on a list returns (labels, probabilities); labels[0][k][0] is the\n",
    "# label string predicted for sentence k.\n",
    "labels = classifier_model.predict(sentences)\n",
    "predicted_labels = [per_sentence[0] for per_sentence in labels[0]]\n",
    "\n",
    "correct_pred = 0\n",
    "perplexity_sum = 0\n",
    "log_rows = []\n",
    "\n",
    "# Passing total= lets tqdm show real progress instead of a bare iteration counter.\n",
    "for sentence, pred_label in tqdm(zip(sentences, predicted_labels), total=len(sentences)):\n",
    "    pred_perplexity = modellm.perplexity(sentence)\n",
    "    perplexity_sum += pred_perplexity\n",
    "    if pred_label == label:\n",
    "        correct_pred += 1\n",
    "    log_rows.append(sentence + '\\t' + pred_label + '\\t' + str(round(pred_perplexity, 2)) + '\\n')\n",
    "\n",
    "# An empty input file would otherwise raise ZeroDivisionError; report NaN instead.\n",
    "n_sentences = len(sentences)\n",
    "accuracy = correct_pred / n_sentences if n_sentences else float('nan')\n",
    "perplexity_avg = round(perplexity_sum / n_sentences, 2) if n_sentences else float('nan')\n",
    "summary = 'Accuracy: {} \\nAverage Perplexity: {}\\n'.format(accuracy, perplexity_avg)\n",
    "print(summary)\n",
    "\n",
    "# Write the summary header, the separator and then every per-sentence row in one pass.\n",
    "# The earlier approach reopened the log in 'r+' mode, seeked to offset 0 and wrote the\n",
    "# header without writing the previously read content back, so the header overwrote the\n",
    "# first rows of the log instead of being prepended to them.\n",
    "with open(path_to_txt_file + '.log', 'w') as text_file:\n",
    "    text_file.write(summary)\n",
    "    text_file.write('--------------------------------------------------------------------------------------------- \\n')\n",
    "    text_file.writelines(log_rows)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the predictions that get split into text files for GLEU scoring\n",
    "# (presumably what the original note meant by \"glue\").\n",
    "\n",
    "import pandas as pd\n",
    "\n",
    "df = pd.read_csv('amazon_all_model_prediction_1.csv', header=None)\n",
    "\n",
    "# header=None keeps the first CSV line as data row 0; that row is skipped here and\n",
    "# every remaining row is converted to a plain Python list.\n",
    "list_sentences = df.iloc[1:].values.tolist()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['the group of hikers is resting in front of a mountain .',\n",
       " 'the man is standing on the grass , trying to get the ball .',\n",
       " 'a skateboarder are is wearing rollerblades is in a crowd .',\n",
       " 'the large crowd is in front of a rock .',\n",
       " 'the drunk skier is stupidly stupidly in front of a mountain .',\n",
       " 'the group of boys trying to be crash in front of a mountain .',\n",
       " 'the group of hikers is resting in front of a mountain looking for bigfoot .',\n",
       " 'the group of hikers is resting in front of a mountain looking for bigfoot .',\n",
       " 'several hikers , gear , mountain in hopes of seeing a bear .']"
      ]
     },
     "execution_count": 33,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Inspect the first data row (one entry per CSV column).\n",
    "first_row = list_sentences[0]\n",
    "first_row"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "\n",
    "out_dir = './gleu_res/ref1'\n",
    "# open(..., 'w') does not create missing directories, so make sure the target exists.\n",
    "os.makedirs(out_dir, exist_ok=True)\n",
    "\n",
    "# zip(*rows) transposes the row lists so each tuple holds one CSV column; one file is\n",
    "# written per column, numbered from 1. An empty list_sentences simply writes nothing.\n",
    "# `file_id` replaces the old counter named `id`, which shadowed the builtin id().\n",
    "for file_id, column in enumerate(zip(*list_sentences), start=1):\n",
    "    with open(os.path.join(out_dir, 'amazon' + str(file_id) + '.txt'), 'w') as text_file:\n",
    "        for sentence in column:\n",
    "            if pd.isnull(sentence):\n",
    "                continue  # missing cell in this column\n",
    "            # str() keeps non-string cells (e.g. numbers) from breaking the concatenation.\n",
    "            text_file.write(str(sentence) + '\\n')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 69,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "\n",
    "# The two metric tables that are averaged cell-by-cell further down.\n",
    "METRICS_CSV_0 = 'matrics_yelp_all_model_prediction_0.csv'\n",
    "METRICS_CSV_1 = 'matrics_yelp_all_model_prediction_1.csv'\n",
    "\n",
    "# header=None: the first CSV line stays in the frame as data row 0.\n",
    "df0 = pd.read_csv(METRICS_CSV_0, header=None)\n",
    "df1 = pd.read_csv(METRICS_CSV_1, header=None)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [],
   "source": [
    "# All rows of df0 except row 0, as nested Python lists.\n",
    "list_cells = df0.iloc[1:].values.tolist()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 97,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Average df0 and df1 cell-by-cell over rows 1-11 and columns 2-8.\n",
    "# Each cell is parsed with float(), the mean is rounded to 4 decimals and kept as a string.\n",
    "list_newval = [\n",
    "    [\n",
    "        str(round((float(df0.loc[row, col]) + float(df1.loc[row, col])) / 2., 4))\n",
    "        for col in range(2, 9)\n",
    "    ]\n",
    "    for row in range(1, 12)\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 101,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Row labels for the results table; assumed to be in the same order as the rows of\n",
    "# list_newval -- verify against the source CSVs.\n",
    "models_list = [\n",
    "    'Source', 'CROSSALIGNED', 'STYLEEMBEDDING', 'MULTIDECODER',\n",
    "    'RETRIEVEONLY', 'TEMPLATEBASED', 'DELETEONLY', 'DELETEANDRETRIEVE',\n",
    "    'BERT_DEL', 'BERT_RET', 'HUMAN',\n",
    "]\n",
    "\n",
    "# One column name per averaged metric column.\n",
    "metric_columns = [\n",
    "    'GLEU',\n",
    "    'BLEU_source',\n",
    "    'BLEU_human',\n",
    "    'fasttext_classifier',\n",
    "    'klm_ppl',\n",
    "    'fastai_classifier',\n",
    "    'fastailm_ppl',\n",
    "]\n",
    "\n",
    "df_res = pd.DataFrame(list_newval, columns=metric_columns)\n",
    "# Place the model names in the leftmost column.\n",
    "df_res.insert(0, 'model', models_list)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 102,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>model</th>\n",
       "      <th>GLEU</th>\n",
       "      <th>BLEU_source</th>\n",
       "      <th>BLEU_human</th>\n",
       "      <th>fasttext_classifier</th>\n",
       "      <th>klm_ppl</th>\n",
       "      <th>fastai_classifier</th>\n",
       "      <th>fastailm_ppl</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Source</td>\n",
       "      <td>0.0757</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.5399</td>\n",
       "      <td>0.045</td>\n",
       "      <td>73.0582</td>\n",
       "      <td>0.02</td>\n",
       "      <td>32.9232</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>CROSSALIGNED</td>\n",
       "      <td>0.0438</td>\n",
       "      <td>0.4561</td>\n",
       "      <td>0.3105</td>\n",
       "      <td>0.768</td>\n",
       "      <td>70.7022</td>\n",
       "      <td>0.737</td>\n",
       "      <td>165.8417</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>STYLEEMBEDDING</td>\n",
       "      <td>0.0587</td>\n",
       "      <td>0.7894</td>\n",
       "      <td>0.4378</td>\n",
       "      <td>0.095</td>\n",
       "      <td>105.7637</td>\n",
       "      <td>0.078</td>\n",
       "      <td>330.7297</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>MULTIDECODER</td>\n",
       "      <td>0.0503</td>\n",
       "      <td>0.5601</td>\n",
       "      <td>0.348</td>\n",
       "      <td>0.506</td>\n",
       "      <td>159.1549</td>\n",
       "      <td>0.482</td>\n",
       "      <td>607.7182</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>RETRIEVEONLY</td>\n",
       "      <td>0.0104</td>\n",
       "      <td>0.2282</td>\n",
       "      <td>0.1798</td>\n",
       "      <td>0.921</td>\n",
       "      <td>8.2492</td>\n",
       "      <td>0.923</td>\n",
       "      <td>52.2676</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>TEMPLATEBASED</td>\n",
       "      <td>0.0907</td>\n",
       "      <td>0.6091</td>\n",
       "      <td>0.4039</td>\n",
       "      <td>0.824</td>\n",
       "      <td>2085.4696</td>\n",
       "      <td>0.803</td>\n",
       "      <td>1451.1379</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>DELETEONLY</td>\n",
       "      <td>0.0644</td>\n",
       "      <td>0.54</td>\n",
       "      <td>0.3663</td>\n",
       "      <td>0.859</td>\n",
       "      <td>100.1707</td>\n",
       "      <td>0.853</td>\n",
       "      <td>205.4436</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>DELETEANDRETRIEVE</td>\n",
       "      <td>0.0686</td>\n",
       "      <td>0.5354</td>\n",
       "      <td>0.3655</td>\n",
       "      <td>0.878</td>\n",
       "      <td>118.7094</td>\n",
       "      <td>0.888</td>\n",
       "      <td>267.4751</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>BERT_DEL</td>\n",
       "      <td>0.1123</td>\n",
       "      <td>0.6828</td>\n",
       "      <td>0.4658</td>\n",
       "      <td>0.708</td>\n",
       "      <td>96.6564</td>\n",
       "      <td>0.728</td>\n",
       "      <td>89.459</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>BERT_RET</td>\n",
       "      <td>0.1009</td>\n",
       "      <td>0.656</td>\n",
       "      <td>0.4456</td>\n",
       "      <td>0.555</td>\n",
       "      <td>135.7993</td>\n",
       "      <td>0.582</td>\n",
       "      <td>148.1388</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>HUMAN</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.532</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.672</td>\n",
       "      <td>3179.2123</td>\n",
       "      <td>0.79</td>\n",
       "      <td>386.0667</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                model    GLEU BLEU_source BLEU_human fasttext_classifier  \\\n",
       "0              Source  0.0757         1.0     0.5399               0.045   \n",
       "1        CROSSALIGNED  0.0438      0.4561     0.3105               0.768   \n",
       "2      STYLEEMBEDDING  0.0587      0.7894     0.4378               0.095   \n",
       "3        MULTIDECODER  0.0503      0.5601      0.348               0.506   \n",
       "4        RETRIEVEONLY  0.0104      0.2282     0.1798               0.921   \n",
       "5       TEMPLATEBASED  0.0907      0.6091     0.4039               0.824   \n",
       "6          DELETEONLY  0.0644        0.54     0.3663               0.859   \n",
       "7   DELETEANDRETRIEVE  0.0686      0.5354     0.3655               0.878   \n",
       "8            BERT_DEL  0.1123      0.6828     0.4658               0.708   \n",
       "9            BERT_RET  0.1009       0.656     0.4456               0.555   \n",
       "10              HUMAN     1.0       0.532        1.0               0.672   \n",
       "\n",
       "      klm_ppl fastai_classifier fastailm_ppl  \n",
       "0     73.0582              0.02      32.9232  \n",
       "1     70.7022             0.737     165.8417  \n",
       "2    105.7637             0.078     330.7297  \n",
       "3    159.1549             0.482     607.7182  \n",
       "4      8.2492             0.923      52.2676  \n",
       "5   2085.4696             0.803    1451.1379  \n",
       "6    100.1707             0.853     205.4436  \n",
       "7    118.7094             0.888     267.4751  \n",
       "8     96.6564             0.728       89.459  \n",
       "9    135.7993             0.582     148.1388  \n",
       "10  3179.2123              0.79     386.0667  "
      ]
     },
     "execution_count": 102,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Render the averaged metrics table.\n",
    "df_res"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 103,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Persist the averaged metrics table (the DataFrame index is written as the first column).\n",
    "output_csv = 'matrics_average.csv'\n",
    "df_res.to_csv(output_csv)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Hard-coded vector of 25 values; averaged element-wise with mat0 in a later cell.\n",
    "mat1_values = [\n",
    "    0.076314, 0.044249, 0.059500, 0.047856, 0.008023,\n",
    "    0.001988, 0.063792, 0.071165, 0.122037, 0.125641,\n",
    "    0.100860, 0.092407, 0.110423, 0.110237, 0.107167,\n",
    "    0.109255, 0.104249, 0.109463, 0.015993, 0.040456,\n",
    "    0.000000, 0.000000, 0.102796, 0.015269, 1.000000,\n",
    "]\n",
    "mat1 = np.array(mat1_values)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Hard-coded vector of 25 values; averaged element-wise with mat1 in a later cell.\n",
    "mat0_values = [\n",
    "    0.075047, 0.043375, 0.057970, 0.052730, 0.012828,\n",
    "    0.082611, 0.065006, 0.066123, 0.109153, 0.093038,\n",
    "    0.100581, 0.084421, 0.114485, 0.090462, 0.104663,\n",
    "    0.093399, 0.102966, 0.094599, 0.019721, 0.034744,\n",
    "    0.004677, 0.000000, 0.096695, 0.012745, 1.000000,\n",
    "]\n",
    "mat0 = np.array(mat0_values)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Element-wise mean of the two vectors.\n",
    "mat_sum = mat0 + mat1\n",
    "matavg = mat_sum / 2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.076314\n",
      "0.044249\n",
      "0.0595\n",
      "0.047856\n",
      "0.008023\n",
      "0.001988\n",
      "0.063792\n",
      "0.071165\n",
      "0.122037\n",
      "0.125641\n",
      "0.10086\n",
      "0.092407\n",
      "0.110423\n",
      "0.110237\n",
      "0.107167\n",
      "0.109255\n",
      "0.104249\n",
      "0.109463\n",
      "0.015993\n",
      "0.040456\n",
      "0.0\n",
      "0.0\n",
      "0.102796\n",
      "0.015269\n",
      "1.0\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "[None,\n",
       " None,\n",
       " None,\n",
       " None,\n",
       " None,\n",
       " None,\n",
       " None,\n",
       " None,\n",
       " None,\n",
       " None,\n",
       " None,\n",
       " None,\n",
       " None,\n",
       " None,\n",
       " None,\n",
       " None,\n",
       " None,\n",
       " None,\n",
       " None,\n",
       " None,\n",
       " None,\n",
       " None,\n",
       " None,\n",
       " None,\n",
       " None]"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Print one value per line. A plain loop replaces the list comprehension, which was\n",
    "# used only for its side effect and made the cell echo a throwaway list of None.\n",
    "for value in mat1:\n",
    "    print(value)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Hard-coded perplexity value per model name, kept in insertion order.\n",
    "ppl_by_model = [\n",
    "    ('Source', 179.98222876548766),\n",
    "    ('CROSSALIGNED', 368.56972133636475),\n",
    "    ('STYLEEMBEDDING', 449.12290202522274),\n",
    "    ('MULTIDECODER', 730.3358956260681),\n",
    "    ('RETRIEVEONLY', 208.2480753135681),\n",
    "    ('TEMPLATEBASED', 385.7471610374451),\n",
    "    ('DELETEONLY', 336.44651431274417),\n",
    "    ('DELETEANDRETRIEVE', 425.2543611755371),\n",
    "    ('BERT_DEL', 223.99889970397948),\n",
    "    ('SEL_DEL', 201.02439992332458),\n",
    "    ('BERT_RET_USE', 301.5724952011108),\n",
    "    ('SAL_RET_USE', 256.1212079248428),\n",
    "    ('BERT_RET_TFIDF', 313.81375119018554),\n",
    "    ('SAL_RET_TFIDF', 303.5601326875686),\n",
    "    ('BERT_RET_GLOVE', 353.2743957939148),\n",
    "    ('SAL_RET_GLOVE', 307.8996427268982),\n",
    "    ('BERT_RET_RANDOM', 415.94113478660586),\n",
    "    ('SAL_RET_RANDOM', 265.90657262611387),\n",
    "    ('RET_ONLY_USE', 188.06904032325744),\n",
    "    ('RET_ONLY_TFIDF', 269.54081302833555),\n",
    "    ('RET_ONLY_GLOVE', 310.337988117218),\n",
    "    ('RET_ONLY_RANDOM', 227.66278438282012),\n",
    "    ('HUMAN', 140.11555947446823),\n",
    "]\n",
    "dictofppl = dict(ppl_by_model)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "179.98222876548766\n",
      "368.56972133636475\n",
      "449.12290202522274\n",
      "730.3358956260681\n",
      "208.2480753135681\n",
      "385.7471610374451\n",
      "336.44651431274417\n",
      "425.2543611755371\n",
      "223.99889970397948\n",
      "201.02439992332458\n",
      "301.5724952011108\n",
      "256.1212079248428\n",
      "313.81375119018554\n",
      "303.5601326875686\n",
      "353.2743957939148\n",
      "307.8996427268982\n",
      "415.94113478660586\n",
      "265.90657262611387\n",
      "188.06904032325744\n",
      "269.54081302833555\n",
      "310.337988117218\n",
      "227.66278438282012\n",
      "140.11555947446823\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "[None,\n",
       " None,\n",
       " None,\n",
       " None,\n",
       " None,\n",
       " None,\n",
       " None,\n",
       " None,\n",
       " None,\n",
       " None,\n",
       " None,\n",
       " None,\n",
       " None,\n",
       " None,\n",
       " None,\n",
       " None,\n",
       " None,\n",
       " None,\n",
       " None,\n",
       " None,\n",
       " None,\n",
       " None,\n",
       " None]"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Print one perplexity per line, in the dict's insertion order. A plain loop replaces\n",
    "# the list comprehension, which was used only for its side effect and made the cell\n",
    "# echo a throwaway list of None.\n",
    "for ppl in dictofppl.values():\n",
    "    print(ppl)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
